# This script reads the NewsWhip data, applies various preprocessing, and then
# outputs two Parquet files for later analyses: the full filtered data set and
# a 10% random sample of it (the rows flagged as the estimation set).
#
# Run restructure_newswhip_data.py first to create the Parquet dataset.
library(tidyverse)
library(arrow)
library(stm)
library(tm)
library(quanteda)
source("src/media.R")

# Fix the RNG seed so the random estimation-set draw at the end of this script
# is reproducible from run to run.
set.seed(1683767830)

# Open the Parquet dataset produced by restructure_newswhip_data.py. Rows are
# only pulled into memory by the collect() call in the pipeline below.
newswhip <- arrow::open_dataset(sources = "data/processed/newswhip")

# Wire-service copy is removed during preprocessing because the analysis is
# about original reporting; some outlets (Breitbart, for example) republish
# large volumes of AP wire stories.
#
# is_wire_report() flags author strings that identify a wire service.
#
# context: unused; present because Arrow passes a context object as the first
#          argument of a registered scalar function.
# x:       character vector of author/byline strings.
#
# Returns a logical vector the same length as x. Missing bylines are FALSE.
is_wire_report <- function(context, x) {
  # Bylines that must match in full.
  exact_bylines <- c(
    "AP News", "AP", "AFP", "UPI",
    "AP0", "UPI0", "AFP0",
    "Reuters News", "AP Reports", "another news agency"
  )
  matches_exactly <- x %in% exact_bylines

  # Bylines that only need to contain a wire-service phrase.
  contains_phrase <- str_detect(x, "Associated Press") |
    str_detect(x, "AP Published")

  # str_detect() yields NA for a missing byline; the !is.na() guard turns that
  # into FALSE so a missing author is never treated as a wire report.
  matches_exactly | (!is.na(x) & contains_phrase)
}
# Register is_wire_report() with Arrow under the same name so it can be called
# inside the dataset query below, before collect(). Input is a string column,
# output is boolean; auto_convert = TRUE makes Arrow hand the function R
# vectors and convert its result back.
register_scalar_function(
  "is_wire_report",
  is_wire_report,
  in_type = utf8(),
  out_type = bool(),
  auto_convert = TRUE
)

# Main filtering pipeline. The steps above collect() are applied to the Arrow
# dataset; the steps below it run in R on the collected rows.
newswhip <- newswhip |>
  # Keep 2017-2021 items that NewsWhip labels as English.
  filter(year >= 2017, year <= 2021, source.language == "en") |>
  # Strip a trailing "| RealClear<Section>" tag from headline and excerpt.
  mutate(headline = str_replace_all(headline, "\\| RealClear\\w+$", ""),
         excerpt = str_replace_all(excerpt, "\\| RealClear\\w+$", "")) |>
  # NOTE(review): original_headline is not listed in the select() below, so it
  # does not survive into the output. Add it to select() if it is wanted.
  mutate(original_headline = headline,
         headline = str_to_lower(headline),
         excerpt = str_to_lower(excerpt),
         article = str_to_lower(article)) |>
  # Headline + excerpt is the text used downstream; collapse whitespace runs.
  mutate(headline_and_blurb = str_c(headline, excerpt, sep = " ")) |>
  mutate(headline_and_blurb = str_replace_all(headline_and_blurb, "\\s+", " ")) |>
  # Time elapsed since 2017-01-01; converted to whole days after collect().
  mutate(day = creation_date - ymd("20170101")) |>
  filter(!is_wire_report(authors)) |>
  select(uuid, link, source.domain, creation_date,
         day, headline, article,
         excerpt, authors, headline_and_blurb, topics,
         fb_data.total_engagement_count) |>
  filter(!is.na(headline_and_blurb)) |>
  filter(headline_and_blurb != "") |>
  filter(fb_data.total_engagement_count > 10) |>
#  slice_sample(prop = 0.1) |>
  collect() |>
  mutate(day = floor(as.numeric(day, units = "days"))) |>
  # We're going to filter out Sports, Culture and Entertainment content
  # rather than subsetting to "News". Unfortunately, NewsWhip topics are pretty
  # inconsistent from publisher to publisher so we can't count on this to
  # do all the filtering we need
  #
  # 32 Sports
  # 3 Entertainment
  # 4 Culture
  #
  # vapply() gives a plain logical column (one TRUE/FALSE per row) instead of
  # a list column, so the filter is an ordinary logical negation. any() always
  # returns a single non-missing value here, even when a row has no topic ids.
  mutate(vals = vapply(topics,
                       \(x) any(c(3, 4, 32) %in% pluck(x, "id")),
                       logical(1))) |>
  filter(!vals) |>
  select(-topics, -vals) |>
  # NOTE: this cuts the data considerably
  # but removes <1% of engagement
  # TODO: There should probably be an arrange() before the distinct().
  #       When there are multiple rows for a single URL, which is to be preferred?
  #       They seem to have the same timestamps.
  distinct(link, .keep_all = TRUE) |>
  # drop some domains we don't want to use right now
  filter(!(source.domain %in% c("babylonbee.com", "oann.com", "fox5dc.com", "wjla.com", "heatst.com",
                                "thebulwark.com", "thedailysheeple.com", "disobedientmedia.com",
                                "intellihub.com", "trendingpolitics.com", "truthfeed.com"))) |>
  # recode_domains() and code_categories() are not defined in this script;
  # presumably they come from the src/media.R file sourced at the top.
  mutate(publisher = recode_domains(source.domain),
         grouping = code_categories(publisher))




# The NewsWhip source-language field is not reliable enough on its own, so run
# fastText language identification over the headline + blurb text and keep only
# rows whose iso_lang_1 prediction is English. Skipping this step produces the
# classic "other language" topic in the topic model.
language_ids <- fastText::language_identification(
  newswhip$headline_and_blurb,
  pre_trained_language_model_path = "data/raw/fasttext_full_langid_model.bin"
)

# Attach the predictions column-wise (row order is unchanged) and filter.
newswhip <- filter(cbind(newswhip, language_ids), iso_lang_1 == "en")

# Tokenize the headline + blurb text with quanteda, dropping punctuation,
# symbols and numbers and splitting hyphenated words, then remove stopwords
# and write the surviving tokens back as a single space-separated string.
corpus <- quanteda::corpus(newswhip$headline_and_blurb)

# Corpus-specific words removed in addition to the SMART English stopword list.
EXTRA_STOPWORDS <- c("video", "getty", "advertisement", "say", "said")

tokens <- quanteda::tokens(
  corpus,
  what = "word",
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  split_hyphens = TRUE
)
tokens <- tokens_remove(tokens, stopwords::stopwords("en", source = "smart"))
tokens <- tokens_remove(tokens, EXTRA_STOPWORDS)

# One string per document, tokens joined by single spaces.
newswhip$headline_and_blurb <- map_chr(as.list(tokens), str_c, collapse = " ")
#
# # Extra text processing
# # I don't like how textProcessor tokenizes so going to do some of this myself.
# # This is mostly removing numbers and splitting on non-word characters, so, e.g.,
# # "COVID-19" becomes "COVID" and "28-year-old" becomes "year old"
# # It's also removing short words, e.g. "be". A bunch of these will get removed
# # as stopwords later, anyway.
# cleanup <- function (x) {
#    x |>
#     str_replace_all("\\W+", " ") |>
#     str_replace_all("[0-9]+", "") |>
#     str_replace_all("\\s+\\w{2}\\s+", " ") |>
#     str_replace_all("\\s+\\w{1}\\s+", " ") |>
#     str_replace_all("\\-", " ") |>
#     str_replace_all("Advertisement", " ")
# }

# TODO: expand. Not covering all cases right now.
# Identify cases where the article text is just an error message and replace
# them with NA.
#
# x: character vector of article texts.
#
# Returns a character vector the same length as x, with download-error
# placeholders set to NA and everything else unchanged.
replace_bad_articles <- function(x) {
  # The article column is lower-cased earlier in this script, so the marker
  # must be matched case-insensitively or it never fires.
  download_failed <- regex("Article download failed", ignore_case = TRUE)
  case_when(
    str_detect(x, download_failed) ~ NA_character_,
    .default = x
  )
}

# There's a lot of domain-specific language that will get thrown into its own
# topics if we don't clean them up. This function should ultimately provide
# regexes for each domain to identify what is actual article text.
#
# d: source domain(s), e.g. "bbc.com".
# x: article text(s), same length as d.
#
# Returns the cleaned article text. No per-domain rule is enabled yet, so the
# text is currently returned unchanged.
domain_specific_cleanup <- function(d, x) {
  # case_when() needs at least one condition; with the only rule disabled, a
  # bare case_when(.default = x) fails instead of passing the text through.
  # Return x directly until a rule is switched on. To enable rules, wrap them
  # in case_when(..., .default = x), e.g.:
  #
  #   d == "bbc.com" ~ str_extract(x, ".*Published\\d{2} \\w+ \\d{4}(?:SharecloseShare)?(?: pageCopy)?\\s?(?:linkAbout)(?: sharingimage)?\\s?(?:copyright[\\w\\s]+image)?\\s?(?:caption)?(.*)Related Topics.*", group = 1),
  x
}

# Blank out failed downloads, take the first 25 words of each article, and
# build a headline + short-article text field. Rows without a publisher are
# dropped.
newswhip <- newswhip |>
  #mutate_at(vars(headline_and_blurb, article), cleanup) |>
  mutate(article = replace_bad_articles(article)) |>
#  mutate(article = map2_chr(source.domain, article, domain_specific_cleanup)) |>
  # stringr::word() returns NA when the requested end index exceeds the number
  # of words, which would discard every article shorter than 25 words. Clamp
  # the end index to the word count (single-space separators, matching word()'s
  # default) so short articles are kept whole.
  mutate(short_article = stringr::word(
    article, 1L, pmin(25L, str_count(article, fixed(" ")) + 1L)
  )) |>
  # Join with a space, as headline_and_blurb does, so the last headline word
  # and the first article word are not fused into one token.
  mutate(headline_and_short_article = str_c(headline, short_article, sep = " ")) |>
  filter(!is.na(publisher))

# Flag a random 10% of rows as the estimation set (1 = in the set, 0 = not).
# The draw depends on the set.seed() call at the top of the script.
mask <- sample.int(nrow(newswhip), floor(nrow(newswhip) * 0.1))

# Vectorised membership test instead of `newswhip[mask, ]$col <- 1`: it gives
# the same 0/1 numeric column but also works when mask is empty (fewer than 10
# rows), where the chained assignment would fail.
newswhip$estimation_set <- as.numeric(seq_len(nrow(newswhip)) %in% mask)

sampled_newswhip <- filter(newswhip, estimation_set == 1)

# Write the full filtered data and the estimation-set sample for later analyses.
write_parquet(newswhip, "data/processed/filtered_newswhip.parquet")
write_parquet(sampled_newswhip, "data/processed/filtered_newswhip_sample.parquet")
